h
x
x
h
h
x
h
x
•
x
•
f
•
h
=
f
(
x
)
V
W
h=
f(
x
)
r=
g(
h
)
x
x
r
h
f
x
h
g
h
r
•
g
•
$r = g(h) = g(f(x))$
•
L
L
(
r
,
x
)
r
x
L
{
x
}
h
x
h
x
h
x
input $x$
Code bottleneck $h$:
undercomplete
representation
reconstruction $r$
Decoder
Encoder
Decoder
Encoder
Code $h$:
overcomplete
representation
h
x
h
r
=
x
x
x
h
||
∂
h
i
∂
x
||
||
∂
h
i
∂
x
||
h
i
x
•
$$h^* = f(x) = \arg\min_h L(g(h), x) + \lambda\, \Omega(h)$$
L
f
g
Ω(
h
)
Ω(
h
)
$|h|_1 = \sum_i |h_i|$
t
$\sum_i \log(1 + \alpha^2 h_i^2)$
αh
i
t
$-\sum_i \left( t \log h_i + (1 - t) \log(1 - h_i) \right)$
,
t
h
i
∈
(0
,
1)
•
•
•
$\left\| \frac{\partial h}{\partial x} \right\|_F^2$
∂
h
i
(
x
)
∂
x
h
i
x
x
x
x
h
h
i
= 0
x
r
=
g
(
f
(
x
))
x
x
+
x
r
x
x
x
f
(
x
)
$\| f(x) - f(y) \| < \| x - y \|$
x
y
||
f
(
x
)
||
<
1
h
−
log
p
(
h
)
p
(
h
)
f
h
−
log
p
(
h
)
h
x
f
g
L
L
x
p
(
x
|
h
)
L
p
(
x
|
h
)
x
h
x
h
h
$p(x \mid h) = \prod_i p(x_i \mid h)$
x
i
|
h
x
i
|
h
x
i
x
h
Q
(
h
|
x
)
P
(
x
|
h
)
q
(
h
|
x
)
p
(
x
|
h
)
p
=
q
p
(
x
,
h
)
g
(
h
)
p
(
x
|
h
)
f
(
x
)
q
(
h
|
x
)
h
q
(
h
|
x
)
p
(
x
|
h
)
q
(
h
|
x
)
=
p
(
h
|
x
)
q
(
h
|
x
)
p
(
x
|
h
)
q
(
h
|
x
)
p
(
x
|
h
)
P
(
x
|
h
)
h
∼
P
(
h
)
x
=
Wh
+
b
+
noise
x
h
p
(
h
)
h
x
$$h \sim p(h), \qquad x = W h + b + \text{noise}$$
h
∼
N
(0
,
I
)
x
i
h
$\psi = \operatorname{diag}(\sigma^2)$
$\sigma^2 = (\sigma_1^2, \sigma_2^2, \ldots)$
x
i
x
$x \sim \mathcal{N}(b,\; W W^\top + \psi)$
W
x
i
x
j
x
i
ˆ
h
k
=
W
k
x
w
ki
k
ˆ
h
k
x
j
w
kj
σ
i
x
W
W
+
σ
2
I
σ
2
$x \sim \mathcal{N}(b,\; W W^\top + \sigma^2 I)$
$x = W h + b + \sigma z$
$z \sim \mathcal{N}(0, I)$
W
σ
2
h
σ
2
σ
→
0
h
x
d
W
h
i
x
σ
→
0
d
W
$$p(h) = \prod_i p(h_i).$$
p
(
x
|
h
)
p
(
h
)
h
h
h
=
U
z
U
z
=
U
h
,
h
(0
,
I
)
z
$$\operatorname{Var}[z] = E[z z^\top] = E[U h h^\top U^\top] = U \operatorname{Var}[h]\, U^\top = U U^\top = I.$$
x
s
=
V
x
h
x
i
h
j
h
j
x
=
f
(
h
)
+
noise
h
p
(
h
)
$$p(h) = \prod_i p(h_i) = \prod_i \frac{\lambda}{2} e^{-\lambda |h_i|}$$
t
$$p(h) = \prod_i p(h_i) \propto \prod_i \frac{1}{\left(1 + \frac{h_i^2}{\nu}\right)^{\frac{\nu + 1}{2}}}.$$
h
i
= 0
p
(
h
|
x
)
h
=
0
h
$$h = f(x) = \arg\min_h L(g(h), x) + \lambda\, \Omega(h)$$
L
(
g
(
h
)
,
x
)
−
log
p
(
x
|
g
(
h
))
Ω(
h
)
−
log
p
(
h
)
h
i
p
(
h
)
x
h
L
=
−
log
p
(
x
|
h
)
h
x
x
f
g
h
=
f
(
x
)
L
=
−
log
P
(
x
|
g
(
f
(
x
)))
g
x
h
=
f
(
x
)
L
=
−
log
p
(
x
|
g
(
f
(
x
)))
g
(
f
(
x
))
g
(
f
(
x
))
p
(
x
|
h
)
p
(
x
|
h
)
=
i
p
(
x
i
|
h
)
x
i
|
h
g
(
h
)
p
(
x
|
h
) =
p
(
x
|
g
(
h
))
p
(
x
|
h
)
p
(
x
)
q
(
h
|
x
)
q
(
h
|
x
)
p
(
h
)
h
=
f
(
x
)
$$h^*(x) = \arg\max_h \log p(h \mid x) = \arg\min_h \frac{\| x - (b + W h) \|^2}{\sigma^2} - \log p(h)$$
σ
2
p
(
h
)
h
= 0
$p(h_i) = \frac{\lambda}{2} e^{-\lambda |h_i|}$
t
$$p(h_i) \propto \frac{1}{\left(1 + \frac{h_i^2}{\nu}\right)^{\frac{\nu + 1}{2}}}.$$
x
h
x
x
λ
x
h
Ω(
h
)
$$L = -\log p(x \mid g(h)) + \Omega(h)$$
g
(
h
)
h
=
f
(
x
)
Ω(
h
)
h
λ
2
e
−
λ
|
h
i
|
$\Omega(h) = \lambda \sum_i |h_i|$
$$-\log p(h) = \sum_i \left( -\log \frac{\lambda}{2} + \lambda |h_i| \right) = \text{const} + \Omega(h)$$
λ
h
λ
t
$$\Omega(h) = \sum_i \frac{\nu + 1}{2} \log\left(1 + \frac{h_i^2}{\nu}\right)$$
ν
h
i
ρ
= 0
.
05
$$\Omega(h) = -\sum_i \left( \rho \log h_i + (1 - \rho) \log(1 - h_i) \right)$$
0
<
h
i
<
1
h
i
= sigmoid(
a
i
)
p
=
h
i
p
=
ρ
h
x
θ
$$\arg\max_\theta p(\theta \mid x) = \arg\max_\theta \left( \log p(x \mid \theta) + \log p(\theta) \right)$$
θ
h
f
(
x
)
$$L = \arg\min_h \| x - g(h) \|^2 + \lambda |h|_1 + \gamma \| h - f(x) \|^2$$
f
g
x
h
h
=
f
(
x
)
h
h
g
f
f
f
g
f
f
f
x
˜
x
C
(˜
x
|
x
)
f
g
$$L = -\log P(x \mid g(f(\tilde{x})))$$
h
=
f
(˜
x
)
x
˜
x
L
=
−
log
p
(
x
|
g
(
f
(
˜
x
)))
˜
x
x
C
(
˜
x
|
x
)
C
(
˜
x
|
x
)
˜
x
x
p
(
x
|
˜
x
)
(
x
,
˜
x
)
x
=
x
˜
x
=
˜
x
C
(
˜
x
|
x
=
x
)
(
x
,
˜
x
)
p
(
x
|
˜
x
)
=
p
(
x
|
g
(
h
))
h
f
(
˜
x
)
g
(
h
)
−
log
p
(
x
|
h
)
x
x
˜
x
$$-\mathbb{E}_{x \sim q(x)}\, \mathbb{E}_{\tilde{x} \sim C(\tilde{x} \mid x)} \log p(x \mid g(f(\tilde{x})))$$
q
(
x
)
C
(˜
x
|
x
)
x
˜
x
˜
x
$g(f(\tilde{x})) \approx E[x \mid \tilde{x}]$
x
˜
x
C
(
˜
x
|
x
)
x
||
g
(
f
(
˜
x
))
−
x
||
2
g
(
f
(
˜
x
))
E
[
x
|
˜
x
]
x
˜
x
g
(
f
(
x
))
−
x
∂
log
q
(
x
)
∂
x
q
(
g
(
f
(
x
))
−
x
)
∂
log
q
(
x
)
∂
x
x
||
g
(
f
(
˜
x
))
−
x
||
2
$$C(\tilde{\mathrm{x}} = \tilde{x} \mid x) = \mathcal{N}(\tilde{x};\; \mu = x,\; \Sigma = \sigma^2 I)$$
σ
2
g
(
f
(
x
))
−
x
σ
2
∂
log
q
(
x
)
∂
x
q
(
x
)
$$\frac{g(f(x)) - x}{\sigma^2} \to \frac{\partial \log q(x)}{\partial x},$$
f
g
g
(
f
(
x
))
x
x
g
(
f
(
x
))
−
x
h
=
f
(
x
)
f
$$\Omega(h) = \left\| \frac{\partial f(x)}{\partial x} \right\|_F^2$$
h
h
x
∂
f
(
x
)
∂
x
f
(
x
)
x
∂
f
(
x
)
∂
x
f
h
∂
f
(
x
)
∂
x
×
∂
h
∂
x
f
g
f
f
g
g
f